curcomp <- "~/Dropbox/Boards/boards_rep_v2/"

# load all packages
setwd(curcomp)
suppressMessages(library(plyr))
suppressMessages(library(tidyverse))
suppressMessages(library(stringi))
suppressMessages(library(reticulate))
suppressMessages(library(lubridate))
suppressMessages(library(pbapply))
suppressMessages(library(parallel))
suppressMessages(library(Matrix))
suppressMessages(library(tm))
suppressMessages(library(readxl))

# Create a new python environment
conda_create("r-reticulate_boards_rep")
# Install packages
py_install(envname = "r-reticulate_boards_rep",
           packages = c("pandas","ftfy","scikit-learn"))
# Source fuzzy matching script
source_python("name_match_script.py")


######################
# This script generates the dataframes used in the analysis
###### Contents ######
# 1. Compustat
# 2. Interlocks
# 3. Sustainability officers
# 4. CDP reporting
# 5. Ad-hoc climate support/opposition and lobbying
# 6. Firm-level political risk
# 7. Sector carbon emissions intensity
# 8. Interlock-weighted measures
# 9. Generate analysis dataframes

# 1. Compustat -------------------------------------
# The Compustat database is propriety so these data cannot be included in the replication dataset.
# Refer to readme.txt to find directions for downloading the Compustat data used in this analysis.

# Load Compustat's "North America Annual Fundamentals" dataset from 1987-01 to 2021-07.
comp <- read_csv("data/compustat_1987_2021.csv")
compustat <- comp
names(compustat)[c(3,8)] <- c("year", "ticker")

# Remove firm-years missing tickers or years and firms located outside the USA
compustat <- compustat %>% filter(is.na(ticker)==F & is.na(year) == F & loc=="USA")

## Deal with repeats
compustat <- compustat %>%
  group_by(year, ticker) %>%
  # Mark repeats
  mutate(dups = if_else(n()>1,1,0)) %>%
  ungroup() %>%
  # Drop the row with the fewest missing columns
  mutate(n_na = apply(apply(.,1,is.na),2,sum)) %>%
  group_by(year, ticker) %>%
  mutate(drop = if_else(n_na != min(n_na),1,0)) %>%
  filter(drop == 0) %>%
  # keep first entry if tied number of missing columns
  mutate(row_number = row_number()) %>%
  mutate(drop = if_else(row_number() != min(row_number()),1,0)) %>%
  filter(drop == 0) %>%
  distinct(.keep_all = F) %>%
  select(-n_na,-drop,-row_number) %>%
  ungroup()

compustat$gvkey <- as.numeric(compustat$gvkey)

compustat$name <- compustat$conml

# Fix Baker Hughes
compustat$gvkey[which(compustat$name=="Baker Hughes Inc")] <- unique(compustat$gvkey[which(compustat$name=="Baker Hughes a GE Co")])
compustat$name[which(compustat$name=="Baker Hughes Inc")] <- unique(compustat$name[which(compustat$name=="Baker Hughes a GE Co")])
compustat <- compustat[-which(compustat$name=="Baker Hughes a GE Co" & is.na(compustat$revt)),]

# Fix Goodman Global
compustat$gvkey[which(compustat$name=="Goodman Global Inc")] <- unique(compustat$gvkey[which(compustat$name=="Goodman Global Group Inc")])
compustat$name[which(compustat$name=="Goodman Global Inc")] <- unique(compustat$name[which(compustat$name=="Goodman Global Group Inc")])

# Fix Compustat names
compustat <-  compustat %>%
  mutate(name = str_remove_all(tolower(name),pattern = "[,]|^the | [(]the[)]$|[.]")) %>%
  mutate(name = str_replace_all(name,pattern = "[']s ",replacement = "s ")) %>%
  mutate(name = str_remove_all(name, pattern = "cl [a-z]$|[-]old$"))

# Save working files
save(compustat,file = "working_files/compustat_temp")
load("working_files/compustat_temp")

## For replication: Generate csvs with gvkey / datadate / tic for raw Compustat data
write_csv(comp %>% select(gvkey, datadate, tic),file = "data/replication_identifiers/compustat_1987_2021_identifiers.csv")

# 2. Interlocks -------------------------------------
# The BoardEx database is propriety so these data cannot be included in the replication dataset.
# Refer to readme.txt to find directions for downloading the BoardEx data used in this analysis.
# Load BoardEx "North America Organization - Composition of Officers, Directors and Senior Managers" dataset for 1933 to 2021.
managers <- read_csv("data/boardex_managers_1933_2021.csv")

# Load BoardEx "North America Company Profiles" dataset for 1933 to 2021
companies <- read_csv("data/boardex_companies_1933_2021.csv") %>% rename(CompanyID = BoardID, CompanyName_profile = BoardName)
companies$CompanyName_profile <- str_remove_all(companies$CompanyName_profile,pattern = "\032")

# Keep only profiles with manager information
companies <- companies %>% filter(CompanyID %in% managers$CompanyID)

## 2.1 Subset BoardEx data to public firms in Compustat dataset -------------------------------------
### 2.1.1 Identify BoardEx and Compustat matches -------------------------------------
# Using WRDS BoardEx CRSP Compustat Link, CIK code, and ticker

## Use WRDS BoardEx CRSP Compustat Link
# Load WRDS link data, keep only distinct rows of Compustat gvkey and BoardEx ID
wrds_link <- read_csv("data/boardex_compustat_link.csv") %>% 
  rename(CompanyID = companyid, gvkey = GVKEY) %>%
  mutate(gvkey = as.numeric(gvkey)) %>%
  select(gvkey, CompanyID,score) %>%
  distinct()

# Keep only WRDS link data with gvkeys in Compustat
wrds_link <- wrds_link %>%
  filter(gvkey %in% compustat$gvkey)

## There are multiple Compustat gvkeys that match the same BoardEx Company ID
# If multiple BoardEx matches, keep the match with the highest score
wrds_link <- wrds_link %>%
  group_by(CompanyID) %>%
  mutate(drop = if_else(score!=min(score),1,0)) %>%
  filter(drop == 0) %>%
  select(-drop) %>%
  ungroup()

## If multiple BoardEx matches tied on the highest score and gvkey is in compustat, match on Ticker. If not ticker match, use fuzzy matching
wrds_link_ties <- wrds_link %>%
  group_by(CompanyID) %>%
  filter(n()>1)

wrds_link_ties_gvkeys <- sapply(unique(wrds_link_ties$CompanyID),function(id){
  wrds_link_entry <- wrds_link_ties[which(wrds_link_ties$CompanyID==id),]
  boardex_entry <- companies[which(companies$CompanyID==id),]
  compustat_entry <- compustat[which(compustat$gvkey %in% wrds_link_entry$gvkey),] %>% select(gvkey, name, ticker) %>% distinct()
  if(any(is.na(boardex_entry$Ticker)==F) & any(unique(compustat_entry$ticker) %in% boardex_entry$Ticker)){
    return(unique(compustat_entry$gvkey[which(compustat_entry$ticker %in% boardex_entry$Ticker)]))
  } else {
    # Remove BoardEx "prior to" and "De-listed" information
    names_boardex <- boardex_entry$CompanyName_profile
    names_boardex_vec <- str_remove_all(names_boardex,pattern = "[)]|prior to|De-listed|\\d{4}|\\d{2}[//]")
    names_boardex_vec <- str_remove_all(names_boardex_vec,pattern = " {1,}$")
    names_boardex_vec <- unlist(str_split(names_boardex_vec,pattern = " [(]",simplify = T))
    names_boardex_vec <- names_boardex_vec[which(names_boardex_vec!="")]
    
    if(length(names_boardex_vec)==1){
      names_boardex_vec <- rep(names_boardex_vec,2)
    }
    
    match_out <- name_match(names_boardex_vec, unique(compustat_entry$name))
    
    # Keep highest confidence match if above 0.50 confidence
    if(any(match_out$match_1_confidence>0.50)){
      return(unique(compustat_entry$gvkey[which(compustat_entry$name==unique(match_out$match_1[which(match_out$match_1_confidence==max(match_out$match_1_confidence))])[1])]))
    } else {
      return(NA_integer_)
    }
  }
})
names(wrds_link_ties_gvkeys) <- unique(wrds_link_ties$CompanyID)

# For ties that genuinely match to multiple gvkeys, combine Compustat gvkey entries
if(any(lengths(wrds_link_ties_gvkeys)>1)){
  for(id in as.numeric(names(wrds_link_ties_gvkeys[which(lengths(wrds_link_ties_gvkeys)>1)]))){
    compustat <- bind_rows(compustat %>% filter(gvkey %in% wrds_link_ties$gvkey[which(wrds_link_ties$CompanyID==id)] == F),
                           compustat %>%
                             filter(gvkey %in% wrds_link_ties$gvkey[which(wrds_link_ties$CompanyID==id)]) %>%
                             group_by(year) %>%
                             # Mark repeats
                             mutate(dups = if_else(n()>1,1,0)) %>%
                             ungroup() %>%
                             # Drop the row with the fewest missing columns
                             mutate(n_na = apply(apply(.,1,is.na),2,sum)) %>%
                             group_by(year) %>%
                             mutate(drop = if_else(n_na != min(n_na),1,0)) %>%
                             filter(drop == 0) %>%
                             # keep first entry if tied number of missing columns
                             mutate(row_number = row_number()) %>%
                             mutate(drop = if_else(row_number() != min(row_number()),1,0)) %>%
                             filter(drop == 0) %>%
                             distinct(.keep_all = F) %>%
                             select(-n_na,-drop,-row_number) %>%
                             ungroup() %>%
                             # regularize gvkey
                             mutate(gvkey = as.numeric(names(table(gvkey))[which(table(gvkey)==max(table(gvkey)))[1]]))
                             )
  }
}

wrds_link_ties_gvkeys <- unlist(wrds_link_ties_gvkeys)[which(unlist(wrds_link_ties_gvkeys) %in% compustat$gvkey)]

# Add back into WRDS link dataframe
wrds_link <- bind_rows(wrds_link %>%
                         group_by(CompanyID) %>%
                         filter(n()==1),
                       wrds_link_ties[which(wrds_link_ties$gvkey %in% wrds_link_ties_gvkeys),])

## There are multiple BoardEx CompanyIDs that match the same Compustat gvkey
# If multiple BoardEx matches, keep the match with the highest score
wrds_link <- wrds_link %>%
  group_by(gvkey) %>%
  mutate(drop = if_else(score!=min(score),1,0)) %>%
  filter(drop == 0) %>%
  select(-drop) %>%
  ungroup()

## If multiple BoardEx matches tied on the highest score and gvkey is in compustat, match on Ticker. If not ticker match, use fuzzy matching
wrds_link_ties <- wrds_link %>%
  group_by(gvkey) %>%
  filter(n()>1)

wrds_link_ties_companyid <- sapply(unique(wrds_link_ties$gvkey),function(gvkey){
  wrds_link_entry <- wrds_link_ties[which(wrds_link_ties$gvkey==gvkey),]
  boardex_entry <- companies[which(companies$CompanyID %in% wrds_link_entry$CompanyID),]
  compustat_entry <- compustat[which(compustat$gvkey==gvkey),] %>% select(gvkey, name, ticker) %>% distinct()
  if(any(is.na(boardex_entry$Ticker)==F) & any(unique(compustat_entry$ticker) %in% boardex_entry$Ticker)){
    return(unique(boardex_entry$CompanyID[which(boardex_entry$Ticker %in% compustat_entry$ticker)]))
  } else {
    # Remove BoardEx "prior to" and "De-listed" information
    names_boardex <- boardex_entry$CompanyName_profile
    names_boardex_vec <- str_remove_all(names_boardex,pattern = "[)]|prior to|De-listed|\\d{4}|\\d{2}[//]")
    names_boardex_vec <- str_remove_all(names_boardex_vec,pattern = " {1,}$")
    names_boardex_vec <- unlist(str_split(names_boardex_vec,pattern = " [(]",simplify = T))
    names_boardex_vec <- names_boardex_vec[which(names_boardex_vec!="")]
    
    names_compustat_vec <- unique(compustat_entry$name)
    if(length(names_compustat_vec)==1){
      names_compustat_vec <- rep(names_compustat_vec,2)
    }
    
    match_out <- name_match(names_compustat_vec,names_boardex_vec)
    
    # Keep highest confidence match if above 0.50 confidence
    if(any(match_out$match_1_confidence>0.50)){
      out <- unique(boardex_entry$CompanyID[which(names_boardex_vec==unique(match_out$match_1[which(match_out$match_1_confidence==max(match_out$match_1_confidence))])[1])])
      if(match_out$match_2_confidence == 1){
        out <- c(out, unique(boardex_entry$CompanyID[which(names_boardex_vec==unique(match_out$match_2[which(match_out$match_2_confidence==max(match_out$match_1_confidence))])[1])]))
      }
      return(out)
    } else {
      return(NA_integer_)
    }
  }
})
names(wrds_link_ties_companyid) <- unique(wrds_link_ties$gvkey)

# For ties that genuinely match to multiple CompanyIDs, combine BoardEx CompanyID entries
if(any(lengths(wrds_link_ties_companyid)>1)){
  for(id in as.numeric(names(wrds_link_ties_companyid[which(lengths(wrds_link_ties_companyid)>1)]))){
    
    managers <- bind_rows(managers %>% filter(CompanyID %in% wrds_link_ties$CompanyID[which(wrds_link_ties$gvkey==id)] == F),
                          managers %>%
                            filter(CompanyID %in% wrds_link_ties$CompanyID[which(wrds_link_ties$gvkey==id)]) %>%
                            # Regularize company ID
                            mutate(CompanyID = as.numeric(names(table(CompanyID))[which(table(CompanyID)==max(table(CompanyID)))[1]])) %>%
                            # Remove any potential duplicates
                            group_by(DirectorID, RoleName, DateStartRole, DateEndRole, Seniority) %>%
                            filter(n()==1))
    
    companies <- companies %>% filter(CompanyID %in% managers$CompanyID)
  }
}

wrds_link_ties_companyid <- unlist(wrds_link_ties_companyid)[which(unlist(wrds_link_ties_companyid) %in% managers$CompanyID)]

# Add back into WRDS link dataframe
wrds_link <- bind_rows(wrds_link %>%
                         group_by(gvkey) %>%
                         filter(n()==1),
                       wrds_link_ties[which(wrds_link_ties$CompanyID %in% wrds_link_ties_companyid),])

# Filter managers by WRDS link
wrds_link_filtered <- wrds_link %>% filter(gvkey %in% compustat$gvkey & CompanyID %in% managers$CompanyID) 

managers_linked <- left_join(wrds_link_filtered,
                             managers) %>% distinct()

# Identify firms that match CIK code and ticker but are not linked by WRDS (N = 45)
companies_cik_ticker <- left_join(companies %>% filter(CompanyID %in% managers$CompanyID & CompanyID %in% managers_linked$CompanyID == F) %>% mutate(CIKCode = as.numeric(CIKCode)) %>% rename(ticker = Ticker),
                                  compustat %>% select(cik, ticker, gvkey) %>% filter(gvkey %in% managers_linked$gvkey == F) %>% distinct() %>% rename(CIKCode = cik) %>% mutate(CIKCode = as.numeric(CIKCode))) %>%
  na.omit()
managers_cik_ticker <- left_join(companies_cik_ticker, managers)

# Identify firms that match CIK code but not ticker and are not linked by WRDS (N = 934)
companies_cik <- left_join(companies %>% filter(CompanyID %in% managers$CompanyID & CompanyID %in% managers_linked$CompanyID == F & CompanyID %in% companies_cik_ticker$CompanyID == F) %>% mutate(CIKCode = as.numeric(CIKCode)) %>% select(-Ticker) %>% na.omit(),
                           compustat %>% select(cik, gvkey) %>% filter(gvkey %in% managers_linked$gvkey == F) %>% distinct() %>% rename(CIKCode = cik) %>% mutate(CIKCode = as.numeric(CIKCode)) %>% na.omit()) %>%
  na.omit()

managers_cik <- left_join(companies_cik,
                          managers)

# Identify firms that match ticker but not CIK code and are not linked by WRDS (N = 138; but only 31 are retained as good matches)
companies_ticker <- left_join(companies %>% filter(CompanyID %in% managers$CompanyID & CompanyID %in% managers_linked$CompanyID == F & CompanyID %in% companies_cik_ticker$CompanyID == F & CompanyID %in% companies_cik$CompanyID == F)  %>% rename(ticker = Ticker) %>% na.omit(),
                           compustat %>% select(ticker, gvkey) %>% filter(gvkey %in% managers_linked$gvkey == F) %>% distinct() %>% na.omit()) %>%
  na.omit()
managers_ticker <- left_join(companies_ticker,
                          managers)

### 2.1.2 Confirm ticker matches via fuzzy matching -------------------------------------
## Conduct fuzzy matching for firms that only match on ticker
# Prepare BoardEx names
names_boardex <- sort(unique(managers_ticker$CompanyName))

# Remove BoardEx "prior to" and "De-listed" information
names_boardex_vec <- str_remove_all(names_boardex,pattern = "[)]|prior to|De-listed|\\d{4}|\\d{2}[//]")
names_boardex_vec <- str_remove_all(names_boardex_vec,pattern = " {1,}$")
names_boardex_vec <- unlist(str_split(names_boardex_vec,pattern = " [(]",simplify = T))

names_boardex <- bind_cols(names_boardex, names_boardex_vec) %>%
  rename(full_name = 1, name_1 = 2, name_2 = 3, name_3 = 4, name_4 = 5) %>%
  pivot_longer(cols = 2:5, names_to = "name_no",values_to = "name_char") %>%
  filter(name_char != "")

# Prepare Compustat names
names_compustat <- sort(unique(compustat$name[which(compustat$ticker %in% companies$Ticker[which(companies$CompanyID %in% managers_ticker$CompanyID)])]))

ticker_matches <- name_match(names_boardex$name_char, names_compustat)

# Add boardex full names
ticker_matches <- left_join(names_boardex %>% select(full_name, name_char) %>% distinct(),
                            ticker_matches %>%
                              rename(name_char = name_to_match))

# If the same compustat name matches to multiple CDP companies, keep only the one with the highest match
ticker_multiple <- ticker_matches %>%
  group_by(match_1) %>%
  filter(length(unique(full_name))>1)

ticker_multiple <- ticker_multiple %>%
  group_by(match_1) %>%
  mutate(match_1 = if_else(match_1_confidence == max(match_1_confidence),match_1, "-99"))

ticker_matches <- bind_rows(ticker_matches %>%
                           group_by(match_1) %>%
                           filter(length(unique(full_name))==1),
                         ticker_multiple)

ticker_matches$match_1_confidence[which(ticker_matches$match_1=="-99")] <- -99

# For companies with multiple matches, keep match with the highest confidence
ticker_matches <- ticker_matches %>%
  group_by(full_name) %>%
  mutate(match_1 = match_1[which(match_1_confidence==max(match_1_confidence))[1]],
         match_1_confidence = match_1_confidence[which(match_1_confidence==max(match_1_confidence))[1]])

# If no confidence, mark as bad
ticker_matches <- ticker_matches %>%
  group_by(full_name) %>%
  mutate(good = if_else(max(match_1_confidence)==-99,0,1))

## Retain good matches
# Indicate first word match
ticker_matches$first_word <- ifelse(str_split(tolower(ticker_matches$name_char),pattern = " ",simplify = T)[,1]==str_split(tolower(ticker_matches$match_1),pattern = " ",simplify = T)[,1],1,0)

# If confidence above 0.25 and first word match, indicate as good
ticker_matches <- ticker_matches %>%
  mutate(good = if_else(good==0,
                        0,
                        if_else(match_1_confidence>0.25 & first_word==1,1,0)))

ticker_matches <- ticker_matches %>%
  filter(good==1) %>%
  group_by(full_name) %>%
  mutate(n=row_number()) %>%
  filter(n==1)

managers_ticker <- managers_ticker[which(managers_ticker$CompanyName %in% ticker_matches$full_name),]

### 2.1.3 Combine BoardEx manager data across Compustat matching criteria -------------------------------------
managers_compustat <- bind_rows(managers_linked, managers_cik_ticker)
managers_compustat <- bind_rows(managers_compustat, managers_cik)
managers_compustat <- bind_rows(managers_compustat, managers_ticker %>% mutate(CIKCode = as.numeric(CIKCode)))

# Fix mistake
# September 20, 2016 is approximate start date for Jon Chorley as Chief Sustainability Officer 
# https://web.archive.org/web/20160920154053/https://www.oracle.com/corporate/executives/index.html
managers_compustat$DateStartRole[which(managers_compustat$DirectorName=="Jon Chorley")] <- "20160920"

# Format date start and end
managers_compustat$DateStartRole <- as_date(managers_compustat$DateStartRole,format = "%Y%m%d")
managers_compustat$DateEndRole[which(managers_compustat$DateEndRole=="C")] <- "20210101"
managers_compustat$DateEndRole <- as_date(managers_compustat$DateEndRole,format = "%Y%m%d")

# Save working files
save(managers_compustat,file = "working_files/managers_compustat_temp")
load("working_files/managers_compustat_temp")

## 2.2 Create dataframe of firm interlocks -------------------------------------

### 2.2.1 For directors who served on multiple boards, create dataframe with one row per director-firm-year (as of Janaury 1 of the year) -------------------------------------
yr_dates <- as_date(paste0(1933:2021,"1231"),format = "%Y%m%d")

cl <- makeForkCluster(detectCores()-1)
clusterExport(cl=cl,"yr_dates")
clusterEvalQ(cl=cl,list(library(tidyverse),library(lubridate)))

manager_yr <- pbapply(cl=cl,managers_compustat,1,function(x){
  # If both present, take all years for which tenure contains January 1
  if(is.na(x['DateStartRole']) == F & is.na(x['DateEndRole']) == F){
    # Identify years containing January 1
    yrs <- yr_dates[which(yr_dates %in% as_date(x['DateStartRole']):as_date(x['DateEndRole']))]
    
    # Create dataframe with one row per director-firm-year
    temp <- as_tibble(t(x)) %>% 
      slice(rep(1,length(yrs))) %>%
      select(-DateStartRole, -DateEndRole) %>%
      mutate(year = year(yrs))
    
    return(temp)
    
    # If missing DateStartRole, only take year of DateEndRole
  } else if (is.na(x['DateStartRole']) & is.na(x['DateEndRole']) == F){
    # Identify year for DateEndRole
    yrs <- as_date(x['DateEndRole'])
    
    # Create dataframe with one row per director-firm-year
    temp <- as_tibble(t(x)) %>% 
      slice(rep(1,length(yrs))) %>%
      select(-DateStartRole, -DateEndRole) %>%
      mutate(year = year(yrs))
    
    return(temp)
    
    # If missing DateEndRole, only take year of DateStartRole
  } else if (is.na(x['DateStartRole']) == F & is.na(x['DateEndRole'])){
    # Identify year for DateStartRole
    yrs <- as_date(x['DateStartRole'])
    
    # Create dataframe with one row per director-firm-year
    temp <- as_tibble(t(x)) %>% 
      slice(rep(1,length(yrs))) %>%
      select(-DateStartRole, -DateEndRole) %>%
      mutate(year = year(yrs))
    
    return(temp)
    # If missing both, omit
  } else {
    NA
  }
})
stopCluster(cl)

# Combine rows and format columns
managers_df <- do.call('rbind',manager_yr)
managers_df$gvkey <- as.numeric(managers_df$gvkey)
managers_df$DirectorID <- as.numeric(managers_df$DirectorID)

managers_df <- managers_df %>% filter(is.na(gvkey)==F)

# Save working files
save(managers_df,file = "working_files/managers_df_temp")
load("working_files/managers_df_temp")

# Retain only director-firm-years in which a director is a board member for at least one firm
directors_df <- managers_df %>%
  group_by(DirectorID, year) %>%
  mutate(todrop = ifelse(any(c("Executive Director","Supervisory Director") %in% Seniority) == F,1,0)) %>%
  filter(todrop == 0) %>%
  select(-todrop) %>%
  ungroup()

# Keep only gvkey, DirectorID, and year (remove duplicate rows, e.g., when same director held multiple roles)
directors_df_id_gvkey <- directors_df %>% 
  select(gvkey, DirectorID, year) %>% 
  rename(directorid = DirectorID) %>%
  distinct()

# Keep only directorships that persist in the follow year
directors_df_id_gvkey <- directors_df_id_gvkey %>%
  group_by(gvkey, directorid) %>%
  mutate(drop = if_else((year+1) %in% year == F, 1,0)) %>%
  filter(drop==0) %>%
  select(-drop)

### 2.2.2 Create long dataframe with year-source-receiver-n_interlocks -------------------------------------
cl <- makeForkCluster(detectCores()-1)
clusterExport(cl=cl,"directors_df_id_gvkey")
clusterEvalQ(cl=cl,list(library(tidyverse)))

interlocks_list <- pblapply(cl=cl,as.character(sort(unique(directors_df_id_gvkey$year))), function(yr){
  # For each year, pivot to create dataframe with one row per director and one column for each firm  
  temp <- directors_df_id_gvkey %>%
    filter(year == as.numeric(yr)) %>%
    mutate(exists = 1) %>%
    pivot_wider(id_cols = c(directorid, year),names_from = gvkey, values_from = exists, values_fill = 0)
  
  if(ncol(temp)>3){
    # Convert to sparse matrix
    temp <- Matrix(as.matrix(temp[,-c(1:2)]))
    
    # Calculate number of interlocks
    temp_cp <- crossprod(temp, temp)
    diag(temp_cp) <- 0
    
    # Convert back to tibble
    temp_cp <- bind_cols(tibble(source = rownames(temp_cp)),as_tibble(as.matrix(temp_cp)))
    
    # Pivot longer
    out <- temp_cp %>%
      pivot_longer(cols = -1,names_to = "receiver",values_to = "n_interlocks") %>%
      filter(source!=receiver & n_interlocks != 0) %>%
      mutate(year = yr) %>%
      select(year,source,receiver,n_interlocks)
    
    return(out)
  }
})
stopCluster(cl)

interlocks_df <- ldply(interlocks_list)

interlocks_df$source <- as.numeric(interlocks_df$source)
interlocks_df$receiver <- as.numeric(interlocks_df$receiver)

# Save working files
save(interlocks_df,file = "working_files/interlocks_df_temp")
load("working_files/interlocks_df_temp")

## For replication: Generate csv with BoardEx Company IDs from the managers and companies datasets
write_csv(managers %>% select(CompanyID, DirectorID), file = "data/replication_identifiers/boardex_managers_1933_2021_identifiers.csv")
write_csv(companies %>% select(CompanyID), file = "data/replication_identifiers/boardex_companies_1933_2021_identifiers.csv")

# Clean up
rm(list=ls()[-which(ls() %in% c("curcomp","compustat","managers_df","interlocks_df","dat"))])

# Create analysis dataframe
dat <- compustat %>%
  select(gvkey, conml, name, year, cik, cusip, naics, revt, emp)

# 3. Sustainability officers -------------------------------------
## 3.1 Identify CSO positions -------------------------------------
# Keywords for identifying CSO positions (adding "csr" and "climate change" from Fu, Tang, and Wang 2020)
cso_keywords <- c("sustainability", "sustainable", "responsibility", "ethics", "environment", "csr","climate change")
# Stem CSO keywords using Porter stemmer
cso_keywords <- unique(stemDocument(cso_keywords))
# Create regex searches
cso_search <- paste0(c(paste0("^",cso_keywords),paste0(" ",cso_keywords),paste0("[//]",cso_keywords)),collapse = "|")

# Perform regex search on position titles
managers_df$cso <- 0
managers_df$cso[grep(x = managers_df$RoleName,pattern = cso_search,ignore.case = T)] <- 1

## 3.2 Create firm-year dataframe -------------------------------------
cso_df <- managers_df %>%
  group_by(gvkey, year) %>%
  summarize(cso_exists = max(cso),.groups = "drop")

# Only retain firm-years for cso_df from 2004 (when Linda Fisher became the first Corporate Sustainability Officer)
cso_df <- cso_df %>%
  filter(year >= 2004)

# Add to analysis dataframe
dat <- left_join(dat,
                 cso_df) %>%
  mutate(cso_exists = if_else(is.na(cso_exists),0,cso_exists))

# Clean up
rm(list=ls()[-which(ls() %in% c("curcomp","compustat","managers_df","interlocks_df","cso_df","dat"))])

# 4. CDP reporting -------------------------------------
## 4.1 Prepare CDP data-------------------------------------
# Load data for 2010-2019
cdp <- read_excel("data/Investor CDP 2010_Public Data.xlsx",sheet = "Summary")
cdp <- cdp %>% arrange(Organisation) %>% mutate(cdp_name = Organisation, cdp_year = 2010, cdp_account_no = `Account No.`) %>%
  filter(Country == "USA") %>%
  select(cdp_name, cdp_account_no, cdp_year)

file_names <- list.files("data/")
file_names <- file_names[str_detect(file_names,pattern = "^Investor")]
for(yr in 2011:2019){
  f_name <- paste0("data/",file_names[grep(file_names,pattern = yr)])
  cdp_temp <- tryCatch(read_excel(f_name,sheet = "Summary"),
                       error = function(e){read_excel(f_name,sheet = "Summary Data")})
  
  cdp_temp <- tryCatch(cdp_temp %>% filter(Country=="USA"|Country=="United States of America"),
                       error = function(e){cdp_temp %>% filter(incorporated_country=="USA")})
  
  if("account_name" %in% colnames(cdp_temp)){
    cdp_temp <- cdp_temp %>% arrange(account_name) %>% select(account_name, account_id) %>% 
      rename(cdp_name = account_name, cdp_account_no = account_id) %>% mutate(cdp_year = yr, cdp_account_no = as.character(cdp_account_no))
  } else {
    if("Account No" %in% colnames(cdp_temp)){
      cdp_temp <- cdp_temp %>% rename(`Account No.` = `Account No`)
    } else if ("Account number" %in% colnames(cdp_temp)){
      cdp_temp <- cdp_temp %>% rename(`Account No.` = `Account number`)
    }
    if("Organization" %in% colnames(cdp_temp)){
      cdp_temp <- cdp_temp %>% rename(Organisation = Organization)
    }
    cdp_temp <- cdp_temp %>% arrange(Organisation) %>% mutate(cdp_year = yr, cdp_account_no = as.character(`Account No.`)) %>%
      select(Organisation, cdp_account_no, cdp_year) %>% rename(cdp_name = Organisation)
  }
  cdp <- bind_rows(cdp, cdp_temp)
}

# Load data for 2006-2009
for(yr in 2006:2009){
  cdp_temp <- read_excel("data/cdp_2006_2009_publicly_responding_companies.xlsx",sheet = as.character(yr))
  colnames(cdp_temp) <- c("cdp_account_no","cdp_name")
  cdp_temp$cdp_year <- yr
  cdp <- bind_rows(cdp, cdp_temp %>% mutate(cdp_account_no = as.character(cdp_account_no)))
}

cdp <- cdp %>% 
  arrange(cdp_name, cdp_account_no, cdp_year)

# Clean CDP corporation names
cdp$cdp_name <- str_split(cdp$cdp_name,pattern = "[(][F|f]ormerly",simplify = T)[,1]
cdp$cdp_name <- str_remove_all(cdp$cdp_name,pattern = "(|,| {0,}|[*])$")
cdp$cdp_name <- stri_trans_general(cdp$cdp_name,"Latin-ASCII")

cdp <-  cdp %>%
  mutate(cdp_name = str_remove_all(tolower(cdp_name),pattern = "[,]|^the | [(]the[)]$|[.]")) %>%
  mutate(cdp_name = str_replace_all(cdp_name,pattern = "[']s ",replacement = "s "))

# Fix mistake where CDP reuses same account number for two different companies
cdp$cdp_account_no[which(cdp$cdp_name=="altaba inc")] <- "-99"
cdp$cdp_account_no[which(cdp$cdp_name=="avantor")] <- "-98"

# Make adjustments to CDP firm names to facilitate matching
cdp$cdp_name[which(cdp$cdp_name=="h&r block inc")] <- "block h&r inc"
cdp$cdp_name[which(cdp$cdp_name=="a schulman inc")] <- "schulman (a) inc"
cdp$cdp_name[which(cdp$cdp_name=="banco santander")] <- "santander holdings usa inc"
cdp$cdp_name[which(cdp$cdp_name=="polo ralph lauren corporation")] <- "ralph lauren corp"
cdp$cdp_name[which(cdp$cdp_name=="salesforcecom inc")] <- "salesforce inc"
cdp$cdp_name[which(cdp$cdp_name=="colgate palmolive company")] <- "colgate-palmolive co"
cdp$cdp_name[which(cdp$cdp_name=="air liquide")] <- "liquid air corp"
cdp$cdp_name[which(cdp$cdp_name=="ei du pont de nemours and company")] <- "e i du pont de nemours and co"
cdp$cdp_name[which(cdp$cdp_name=="filtrona")] <- "american filtrona corp"
cdp$cdp_name[which(cdp$cdp_name=="jm smucker company")] <- "j m smucker company"
cdp$cdp_name[which(cdp$cdp_name=="wal-mart stores inc")] <- "walmart inc"
cdp$cdp_name[which(cdp$cdp_name=="yum! brands inc")] <- "yum brands inc"
cdp$cdp_name[which(cdp$cdp_name=="apache corporation")] <- "apa corporation"
cdp$cdp_name[which(cdp$cdp_name=="william wrigley jr company")] <- "wm wrigley jr co"
cdp$cdp_name[which(cdp$cdp_name=="lowes companies inc")] <- "lowes cos inc"
cdp$cdp_name[which(cdp$cdp_name=="jcpenney")] <- "penney (j c) funding corp"
cdp$cdp_name[which(cdp$cdp_name=="dell")] <- "dell technologies inc"
cdp$cdp_name[which(cdp$cdp_name=="american axle & mfg holdings inc")] <- "american axle & manufacturing holdings inc"
cdp$cdp_name[which(cdp$cdp_name=="molson coors brewing company")] <- "molson coors beverage company"
cdp$cdp_name[which(cdp$cdp_name=="molex incorporated")] <- "molex inc"
cdp$cdp_name[which(cdp$cdp_name=="pall corporation")] <- "pall corp"
cdp$cdp_name[which(cdp$cdp_name=="ball corporation")] <- "ball corp"
cdp$cdp_name[which(cdp$cdp_name=="southern company")] <- "southern co"
cdp$cdp_name[which(cdp$cdp_name=="tennant company")] <- "tennant co"
cdp$cdp_name[which(cdp$cdp_name=="royal caribbean cruises ltd")] <- "royal caribbean group"
cdp$cdp_name[which(cdp$cdp_name=="first horizon national corporation")] <- "first horizon corp"
cdp$cdp_name[which(cdp$cdp_name=="enbridge inc")] <- "enbridge energy partners lp"
cdp$cdp_name[which(cdp$cdp_name=="bemis company")] <- "bemis co inc"
cdp$cdp_name[which(cdp$cdp_name=="corning incorporated")] <- "corning inc"
cdp$cdp_name[which(cdp$cdp_name=="intel corporation")] <- "intel corp"
cdp$cdp_name[which(cdp$cdp_name=="windstream corporation")] <- "windstream holdings inc"
cdp$cdp_name[which(cdp$cdp_name=="nasdaq omx group inc")] <- "nasdaq inc"
cdp$cdp_name[which(cdp$cdp_name=="bny mellon")] <- "bank of new york mellon corp"
cdp$cdp_name[which(cdp$cdp_name=="williams companies inc")] <- "williams cos inc"
cdp$cdp_name[which(cdp$cdp_name=="crane co")] <- "crane holdings co"
cdp$cdp_name[which(cdp$cdp_name=="sungard")] <- "sungard data systems inc"
cdp$cdp_name[which(cdp$cdp_name=="bbva")] <- "bbva compass bancshares inc"
cdp$cdp_name[which(cdp$cdp_name=="casella")] <- "casella waste systems inc"
cdp$cdp_name[which(cdp$cdp_name=="prudential plc")] <- "prudential bancorp inc"
cdp$cdp_name[which(cdp$cdp_name=="aes corporation")] <- "aes corp"
cdp$cdp_name[which(cdp$cdp_name=="itt corporation")] <- "itt inc"
cdp$cdp_name[which(cdp$cdp_name=="xerox")] <- "xerox holdings corp"
cdp$cdp_name[which(cdp$cdp_name=="charles schwab corporation")] <- "schwab (charles) corp"
cdp$cdp_name[which(cdp$cdp_name=="archer daniels midland")] <- "archer-daniels-midland co"
cdp$cdp_name[which(cdp$cdp_name=="anheuser busch inbev")] <- "anheuser busch inbev"

# Make account numbers uniform within companies
cdp <- cdp %>%
  group_by(cdp_name) %>%
  mutate(cdp_account_no = unique(cdp_account_no)[1]) %>%
  distinct()

## 4.2 Match CDP data to Compustat data -------------------------------------
## Conduct fuzzy matching for CDP firm names
# Prepare BoardEx names
names_cdp <- sort(unique(cdp$cdp_name))

# Prepare Compustat names
# Keep only compustat companies that exist during CDP observation years (2006-2019)
compustat_cdp_match <- compustat %>%
  filter(gvkey %in% compustat$gvkey[which(compustat$year %in% 2006:2009)])

names_compustat <- sort(unique(compustat_cdp_match$name))

cdp_matches <- name_match(names_cdp, names_compustat)

# Add cdp account number
cdp_matches <- left_join(cdp %>% select(cdp_account_no, cdp_name) %>% distinct(),
                         cdp_matches %>%
                           rename(cdp_name = name_to_match))

# If the same compustat name matches to multiple CDP companies, keep only the one with the highest match
cdp_multiple <- cdp_matches %>%
  group_by(match_1) %>%
  filter(length(unique(cdp_account_no))>1)

cdp_multiple <- cdp_multiple %>%
  group_by(match_1) %>%
  mutate(match_1 = if_else(match_1_confidence == max(match_1_confidence),match_1, "-99"))

cdp_matches <- bind_rows(cdp_matches %>%
                           group_by(match_1) %>%
                           filter(length(unique(cdp_account_no))==1),
                         cdp_multiple)

cdp_matches$match_1_confidence[which(cdp_matches$match_1=="-99")] <- -99

# For companies with multiple matches, keep match with the highest confidence
cdp_matches <- cdp_matches %>%
  group_by(cdp_account_no) %>%
  mutate(match_1 = match_1[which(match_1_confidence==max(match_1_confidence))[1]],
         match_1_confidence = match_1_confidence[which(match_1_confidence==max(match_1_confidence))[1]])

## Retain good matches
# Indicate first word match
cdp_matches$first_word <- ifelse(str_split(tolower(cdp_matches$cdp_name),pattern = " ",simplify = T)[,1]==str_split(tolower(cdp_matches$match_1),pattern = " ",simplify = T)[,1],1,0)

# If no confidence, mark as bad
cdp_matches <- cdp_matches %>%
  group_by(cdp_account_no) %>%
  mutate(good = if_else(max(match_1_confidence)==-99,0,1))

# If confidence above 0.25 and first word match, indicate as good
cdp_matches <- cdp_matches %>%
  group_by(cdp_account_no) %>%
  mutate(good = if_else(good==0,
                        0,
                        if_else(max(match_1_confidence,na.rm = T)>0.25 & max(first_word,na.rm = T)==1,1,0)))
  
# Fix mistakes
cdp_matches$good[which(cdp_matches$cdp_account_no %in% c(8878, 774, 28600, 19305))] <- 0

## 4.3 Identify firm-years with CDP reporting -------------------------------------
# Merge compustat gvkeys to matching firms
cdp <- left_join(cdp,
          cdp_matches %>%
            filter(good == 1) %>%
            select(cdp_account_no, cdp_name, match_1) %>%
            rename(compustat_name = match_1))

cdp <- left_join(cdp,
                 compustat_cdp_match %>%
                   select(name, gvkey) %>%
                   rename(compustat_name = name) %>% distinct())

## Adjust gvkeys to align boardex and cdp matches
cdp$gvkey[grep(cdp$cdp_name, pattern = "vwr international llc")] <- unique(managers_df$gvkey[grep(tolower(managers_df$CompanyName),pattern = "vwr corp")])
cdp$gvkey[grep(cdp$cdp_name,pattern = "trimble")] <- unique(managers_df$gvkey[grep(tolower(managers_df$CompanyName),pattern = "trimble")])
cdp$gvkey[grep(cdp$cdp_name,pattern = "berry plastics|berry global")] <- unique(managers_df$gvkey[grep(tolower(managers_df$CompanyName),pattern = "berry global group")])
cdp$gvkey[grep(cdp$cdp_name,pattern = "^alcoa")] <- unique(managers_df$gvkey[grep(tolower(managers_df$CompanyName),pattern = "^alcoa")])

cdp_df <- cdp %>%
  ungroup() %>%
  rename(year = cdp_year) %>%
  select(gvkey, year) %>%
  filter(is.na(gvkey) == F) %>%
  mutate(cdp_report = 1)

# Add to analysis dataframe
dat <- left_join(dat,
                 cdp_df) %>%
  mutate(cdp_report = if_else(is.na(cdp_report) & year %in% 2006:2019,0,cdp_report))

## For replication: Generate csv with CDP IDs for 2010-2019
write_csv(cdp %>% select(cdp_name, cdp_year) %>% filter(cdp_year %in% 2010:2019) %>% distinct(),file = "data/replication_identifiers/cdp_2010_2019_identifiers.csv")

# Clean up
rm(list=ls()[-which(ls() %in% c("curcomp","compustat","managers_df","interlocks_df","cso_df","cdp_df","dat"))])

# 5. Ad-hoc climate support/opposition -------------------------------------

all <- read.csv("data/all climate coalitions.csv")
allfirms <- read.csv("data/climate actors basic covariates cleaned with outcomes.csv")
all <- join(all, allfirms[,c("name","listed","ticker","GUOticker","GUObvdid")], type = "left")

coals <- read.csv("data/climate coalitions progress.csv")

## 5.1 Supporting coalitions -------------------------------------
supcoals <- coals$org[coals$position %in% c("weakly favor","strongly favor")]
panelSUP <- all[,c(1:5,98:101)]
yearsdat <- data.frame(matrix(0, nrow = nrow(panelSUP), ncol = 2019-1995+1))
colnames(yearsdat) <- 1995:2019
panelSUP <- cbind(panelSUP, yearsdat)

for(i in 1:length(supcoals)){
  # i <- 1
  print(i)
  curcoal <- supcoals[i]
  cur <- read.csv(paste("data/group_panels/", curcoal, " members.csv", sep = ""), colClasses = "character")
  years <- substr(cur$links, nchar(cur$links)-3, nchar(cur$links)); years <- years[years != ""]
  if(sum((as.numeric(years) %in% c(1985:2020)) == FALSE) > 0){print(paste("Check", curcoal))}
  links <- gsub(" ", "", substr(cur$links, 1, 2)); links <- links[links != ""]
  scols <- grepl("source",names(cur))
  if(sum((colnames(cur)[scols] %in% paste("source", 1:15, sep = "")) == FALSE) > 0){print(paste("Check", "curcoal"))}
  yr_span <- unique(sort(rep(as.numeric(years), each = 4) + rep(c(0,1,2,3), length(years))))
  yr_span <- yr_span[yr_span < 2020]
  cur_yrs <- data.frame(cur[,1:2], matrix(NA, nrow = nrow(cur), ncol = length(yr_span)))
  names(cur_yrs)[3:(2+length(yr_span))] <- yr_span
  for(j in sort(years)){
    # j <- "2013"
    link <- links[years == j]
    yrs_subset <- colnames(cur_yrs)[3:ncol(cur_yrs)][as.numeric(colnames(cur_yrs)[3:ncol(cur_yrs)]) %in% (as.numeric(j) + c(0,1,2,3))]
    cur_yrs[,yrs_subset] <- 0
    cur_yrs[cur[,paste("source",link,sep="")] == link,yrs_subset] <- 1
  }
  ndraws <- sum(scols)
  for(k in cur_yrs$all_name_match){
    # k <- "Van Ness Feldman LLP"
    panelSUP[panelSUP$name == k,as.character(yr_span)] <- panelSUP[panelSUP$name == k,as.character(yr_span)] + cur_yrs[cur_yrs$all_name_match == k,as.character(yr_span)]
  }
  panelSUP[panelSUP$name %in% cur_yrs$all_name_match,]
}

panelSUP2 <- do.call("rbind", replicate(length(1995:2019), panelSUP[,1:9], simplify = FALSE))
panelSUP2$year <- rep(1995:2019, each = nrow(panelSUP))
panelSUP2$numsupcoal <- NA
panelSUP2$numsupcoal <- c(unlist(panelSUP[,10:ncol(panelSUP)]))

## 5.2 Opposing coalitions -------------------------------------

oppcoals <- coals$org[coals$position %in% c("weakly oppose","strongly oppose")]
panelOPP <- all[,c(1:5,98:101)]
yearsdat <- data.frame(matrix(0, nrow = nrow(panelOPP), ncol = 2019-1992+1))
colnames(yearsdat) <- 1992:2019
panelOPP <- cbind(panelOPP, yearsdat)

for(i in 1:length(oppcoals)){
  # i <- 19
  print(i)
  curcoal <- oppcoals[i]
  cur <- read.csv(paste("data/group_panels/", curcoal, " members.csv", sep = ""), colClasses = "character")
  years <- substr(cur$links, nchar(cur$links)-3, nchar(cur$links)); years <- years[years != ""]
  if(sum((as.numeric(years) %in% c(1985:2020)) == FALSE) > 0){print(paste("Check", curcoal))}
  links <- gsub(" ", "", substr(cur$links, 1, 2)); links <- links[links != ""]
  scols <- grepl("source",names(cur))
  if(sum((colnames(cur)[scols] %in% paste("source", 1:15, sep = "")) == FALSE) > 0){print(paste("Check", "curcoal"))}
  yr_span <- unique(sort(rep(as.numeric(years), each = 4) + rep(c(0,1,2,3), length(years))))
  yr_span <- yr_span[yr_span < 2020]
  yr_span <- yr_span[yr_span > 1991]
  cur_yrs <- data.frame(cur[,1:2], matrix(NA, nrow = nrow(cur), ncol = length(yr_span)))
  names(cur_yrs)[3:(2+length(yr_span))] <- yr_span
  for(j in sort(years)){
    # j <- "2013"
    link <- links[years == j]
    yrs_subset <- colnames(cur_yrs)[3:ncol(cur_yrs)][as.numeric(colnames(cur_yrs)[3:ncol(cur_yrs)]) %in% (as.numeric(j) + c(0,1,2,3))]
    cur_yrs[,yrs_subset] <- 0
    cur_yrs[cur[,paste("source",link,sep="")] == link,yrs_subset] <- 1
  }
  ndraws <- sum(scols)
  for(k in cur_yrs$all_name_match){
    # k <- "Van Ness Feldman LLP"
    panelOPP[panelOPP$name == k,as.character(yr_span)] <- panelOPP[panelOPP$name == k,as.character(yr_span)] + cur_yrs[cur_yrs$all_name_match == k,as.character(yr_span)]
  }
  panelOPP[panelOPP$name %in% cur_yrs$all_name_match,]
}

panelOPP2 <- do.call("rbind", replicate(length(1992:2019), panelOPP[,1:9], simplify = FALSE))
panelOPP2$year <- rep(1992:2019, each = nrow(panelOPP))
panelOPP2$numoppcoal <- NA
panelOPP2$numoppcoal <- c(unlist(panelOPP[,10:ncol(panelOPP)]))

# 5.3 Climate lobbying -------------------------------------
panelLOB <- all[,c(1:5,9:11,98:101)]
yearsdat <- data.frame(matrix(0, nrow = nrow(panelSUP), ncol = 2019-1995+1))
colnames(yearsdat) <- 1995:2019
panelLOB <- cbind(panelLOB, yearsdat)
panelLOB[,as.character(1995:2019)] <- 0

lobreps <- read.csv("data/cur_report.csv")
lobgroups <- panelLOB$lob_client[panelLOB$lob_client != ""] 

for(i in lobgroups){
  # i <- "Printing Industries of America"
  cur <- lobreps[lobreps$client == i,]
  lobyears <- unique(cur$year.x)
  panelLOB[panelLOB$lob_client == i,as.character(lobyears)] <- 1
}
panelLOB[,as.character(c(1995:1999,2018:2019))] <- NA

panelLOB2 <- do.call("rbind", replicate(length(1995:2019), panelLOB[,1:12], simplify = FALSE))
panelLOB2$year <- rep(1995:2019, each = nrow(panelLOB))
panelLOB2$lob <- NA
panelLOB2$lob <- c(unlist(panelLOB[,13:ncol(panelLOB)]))

## 5.3 Match support and opposition to Compustat data using CIK codes -------------------------------------
# Merge bvdids with cik crosswalk
bvdid_cik <- read_excel("data/bvdid_cik_crosswalk.xlsx") %>% 
  select(-c(1:2)) %>% 
  rename(bvdid = 1, cik = 2) %>%
  filter(nchar(cik)==10)

## For supporting coalitions
panelSUP3 <- left_join(panelSUP2,
                       bvdid_cik)

# If main bvdid didn't work, use the GUObvdid
panelSUP3 <- bind_rows(panelSUP3 %>% filter(is.na(cik)==F),
                       left_join(panelSUP3 %>% filter(is.na(cik)) %>% select(-cik),
                                 bvdid_cik %>% rename(GUObvdid = bvdid)) %>%
                         filter(is.na(cik)==F))

# Add coalition memberships across firms with subsidiaries in dataset
panelSUP3 <- panelSUP3 %>%
  group_by(cik,year) %>%
  summarize(numsupcoal=sum(numsupcoal),.groups = "drop")

dat <- left_join(dat, 
                 panelSUP3) %>%
  mutate(numsupcoal = if_else(is.na(numsupcoal) & year %in% 1995:2019,0,numsupcoal))

## For opposing coalitions
panelOPP3 <- left_join(panelOPP2,
                       bvdid_cik)

# If main bvdid didn't work, use the GUObvdid
panelOPP3 <- bind_rows(panelOPP3 %>% filter(is.na(cik)==F),
                       left_join(panelOPP3 %>% filter(is.na(cik)) %>% select(-cik),
                                 bvdid_cik %>% rename(GUObvdid = bvdid)) %>%
                         filter(is.na(cik)==F))

# Add coalition memberships across firms with subsidiaries in dataset
panelOPP3 <- panelOPP3 %>%
  group_by(cik,year) %>%
  summarize(numoppcoal=sum(numoppcoal),.groups = "drop")

dat <- left_join(dat, 
                 panelOPP3) %>%
  mutate(numoppcoal = if_else(is.na(numoppcoal) & year %in% 1992:2019,0,numoppcoal))

## For lobbying
panelLOB3 <- left_join(panelLOB2,
                       bvdid_cik)

# If main bvdid didn't work, use the GUObvdid
panelLOB3 <- bind_rows(panelLOB3 %>% filter(is.na(cik)==F),
                       left_join(panelLOB3 %>% filter(is.na(cik)) %>% select(-cik),
                                 bvdid_cik %>% rename(GUObvdid = bvdid)) %>%
                         filter(is.na(cik)==F))

# Add coalition memberships across firms with subsidiaries in dataset
panelLOB3 <- panelLOB3 %>%
  group_by(cik,year) %>%
  summarize(lob=max(lob),.groups = "drop")

dat <- left_join(dat, 
                 panelLOB3) %>%
  mutate(lob = if_else(is.na(lob) & year %in% 2000:2017,0,lob))

# For replication: generate list of bvdids for Orbis-Compustat crosswalk (via CIK Code and Ticker)
bvdid_list = tibble(bvdid = unique(c(panelSUP2$bvdid, panelSUP2$GUObvdid, panelOPP2$bvdid, panelOPP2$GUObvdid, panelLOB2$bvdid,panelLOB2$GUObvdid))) %>% filter(bvdid != "")
write_delim(bvdid_list, file = "data/replication_identifiers/bvdid_identifiers.txt",col_names = F)

# Clean up
rm(list=ls()[-which(ls() %in% c("cdp_df","compustat","cso_df","curcomp","dat","interlocks_df","managers_df","panelSUP3","panelOPP3","panelLOB3"))])

# 6. Firm-level political risk -------------------------------------

# Load risk data
risk <- read.csv("data/flpr_firmquarter_2020q4.csv")
risk$year <- substr(risk$date, 1, 4)

# Calculate annual average risk by firm-year
risk2 <- risk %>%
  group_by(gvkey, year) %>%
  summarize(cusip = unique(cusip)[which.max(tabulate(match(cusip, unique(cusip))))],
            flpr_envrisk = mean(PRiskT_environment,na.rm = T),.groups = "drop")

dat <- left_join(dat %>% mutate(gvkey = as.integer(gvkey)),
          risk2 %>% select(gvkey, year, flpr_envrisk) %>% mutate(year = as.numeric(year)))

dat[which(is.na(dat$flpr_envrisk)),] <- left_join(dat[which(is.na(dat$flpr_envrisk)),] %>% select(-flpr_envrisk),
                                                        risk2 %>% filter(gvkey %in% dat$gvkey==F & cusip != "") %>% select(cusip, year, flpr_envrisk) %>% mutate(year = as.numeric(year))
                                                        )

# 7. Sector carbon emissions intensity-------------------------------------
# Use average direct emissions intensity data (1998, 2002, 2006) from 
# Henry, David, Beethika Khan, and Sandra Cooke-Hull. 2010. 
# U.S. Carbon Dioxide Emissions and Intensities over Time: A Detailed Accounting of Industries, Government, and Households. 
# Technical report, Economic and Statistics Administration.

co2 <- read_csv("data/esa_ghg_naics6.csv")

# average direct emissions intensity across 1998, 2002, and 2006
co2 <- co2 %>%
  group_by(naics_2012_code) %>%
  summarize(emissions_direct_intensity = mean(emissions_direct_intensity,na.rm = T)) %>%
  rename(naics = naics_2012_code)

dat <- left_join(dat, co2)

# Adjust direct emissions intensity variable so that the emissions intensity at the 80th percentile = 1
dat$emissions_direct_intensity <- dat$emissions_direct_intensity/quantile(dat$emissions_direct_intensity,probs = 0.8,na.rm = T)

# Clean up dat dataframe
dat <- dat %>% select(-name)

# Save working files
save(dat,file = "working_files/dat_temp")
load("working_files/dat_temp")
load("working_files/interlocks_df_temp")

# 8. Interlock-weighted measures -------------------------------------

## 8.1 Load parallelized functions  -------------------------------------

# For calculating hamming distance
sedist_parallel <- function (dat, g = c(1:dim(dat)[1]), cl=cl) {
  require(sna)
  dat <- as.sociomatrix.sna(dat, simplify = TRUE)
  
  n <- dim(dat)[2]
  m <- 1
  d <- array(dim = c(m, n, n))
  d[1, , ] <- dat
  
  o <- array(dim = c(m, n, n))
  for (k in 1:m) {
    clusterExport(cl=cl,list("n"))
    
    i_j <- pbsapply(cl=cl,1:n,function(i){
      v_i <- c(as.vector(d[k,i,]),as.vector(d[k,,i]))
      sapply(1:n, function(j){
        v_j <- c(as.vector(d[k,j,]),as.vector(d[k,,j]))
        sum(abs(v_i - v_j),na.rm = T)
      })
    })
    
    o[k,,] <- i_j
  }
  
  if (dim(o)[1] == 1) {
    as.matrix(o[1, , ])
  } else {
    o
  }
}

# For calculating structural equivalence (from Hadden and Jasny (2017)
makeSEW<-function(l){
  res<-matrix(0,nrow(l),ncol(l))
  for(i in 1:nrow(l)){
    for(j in 1:ncol(l)){
      res[i,j]<-l[j,i]/(1-l[i,i])
    }
  }
  res
}

makeSEL <- function(cl=cl,mat=mat){
  require(pbapply)
  out <- pbsapply(cl=cl,1:nrow(mat),function(j){
    max_j <- max(mat[,j])
    s <- sapply(1:nrow(mat),function(i){
      (max_j - mat[i,j])^1
    })
    s/sum(s)
  })
  out
}

## 8.2 Create function for calculating interlock-weighted measures  -------------------------------------
# number of interlocks, number of indirect interlocks, eigenvector centrality
# interlock-weighted DVs, indirect interlock-weighted DVs, structural equivalence-weighted DVs

interlock_measures_func <- function(dat = dat, interlocks = interlock_df,
                                    working_file_path = "working_files/", analysis_file_path = "", file_prefix = ""){
  cl <- makeCluster(detectCores()-1)
  clusterEvalQ(cl=cl, list(library(pbapply)))
  
  dat$num_interlocks <- NA; dat$num_indinterlocks <- NA; dat$num_indinterlocks_alt <- NA; dat$evc <- NA
  dat$cso_exists_ilwtd <- NA; dat$cso_exists_indilwtd <- NA; dat$cso_exists_indilwtd_alt <- NA; dat$cso_exists_sewtd <- NA; 
  dat$cdp_report_ilwtd <- NA; dat$cdp_report_indilwtd <- NA; dat$cdp_report_indilwtd_alt <- NA; dat$cdp_report_sewtd <- NA; 
  dat$numsupcoal_ilwtd <- NA; dat$numsupcoal_indilwtd <- NA; dat$numsupcoal_indilwtd_alt <- NA; dat$numsupcoal_sewtd <- NA; 
  dat$numoppcoal_ilwtd <- NA; dat$numoppcoal_indilwtd <- NA; dat$numoppcoal_indilwtd_alt <- NA; dat$numoppcoal_sewtd <- NA; 
  dat$lob_ilwtd <- NA; dat$lob_indilwtd <- NA; dat$lob_indilwtd_alt <- NA; dat$lob_sewtd <- NA
  dat$lobsup_ilwtd <- NA; dat$lobsup_indilwtd <- NA; dat$lobsup_indilwtd_alt <- NA; dat$lobsup_sewtd <- NA
  dat$lobopp_ilwtd <- NA; dat$lobopp_indilwtd <- NA; dat$lobopp_indilwtd_alt <- NA; dat$lobopp_sewtd <- NA
  
  for(yr in 1995:2019){
    # yr <- 2016
    # Create matrix of interlocks that persist in the next year
    interlocks_yr <- interlocks %>%
      filter(year == yr) %>%
      select(source, receiver, n_interlocks) %>%
      pivot_wider(id_cols = receiver, 
                  names_from = source, 
                  values_from = n_interlocks, 
                  values_fill = 0)
    
    interlocks_yr <- interlocks_yr[,c("receiver",sort(colnames(interlocks_yr)[-1]))]
    
    # Convert to sparse matrix
    int_mat <- as.matrix(interlocks_yr[,-1])
    rownames(int_mat) <- colnames(int_mat)
    int_mat <- Matrix(int_mat,sparse = T)
    
    # Create indirect interlock matrix by taking crossproduct
    ind_int_mat <- as.matrix(crossprod(int_mat, int_mat))
    diag(ind_int_mat) <- 0
    
    # Alternative formulation of indirect interlock matrix (binarize and remove indirect interlocks that also have direct interlocks)
    ind_int_mat_alt <- ind_int_mat
    ind_int_mat_alt[as.matrix(int_mat)>0] <- 0
    ind_int_mat_alt[ind_int_mat_alt>0] <- 1
    
    # Create structural equivalence matrix using the hamming distance between each pair of firms
    se_int_mat_dist <- as.matrix(sedist_parallel(cl = cl, dat = as.matrix(int_mat)))
    rownames(se_int_mat_dist) <- colnames(se_int_mat_dist) <- rownames(int_mat)
    mat <- as.matrix(se_int_mat_dist)
    clusterExport(cl=cl, list("mat"),envir = environment())
    se_int_mat <- makeSEW(makeSEL(cl = cl, mat = mat))
    diag(se_int_mat) <- 0
    rownames(se_int_mat) <- colnames(se_int_mat) <- rownames(int_mat)
    
    # Create interlock-weighted measures
    # ilwtd = interlock-weighted
    # indilwtd = indirect interlock-weighted
    # sewtd = structural equivalence-weighted
    # evc = eigenvector centrality
    
    # Keep only interlocks that have associated Compustat data
    cur_yrfirm <- dat$year == yr & (dat$gvkey %in% rownames(int_mat))
    cur_yrfirm_gvkey <- dat$gvkey[cur_yrfirm]
    int_mat <- int_mat[as.character(cur_yrfirm_gvkey),as.character(cur_yrfirm_gvkey)]
    ind_int_mat <- ind_int_mat[as.character(cur_yrfirm_gvkey),as.character(cur_yrfirm_gvkey)]
    ind_int_mat_alt <- ind_int_mat_alt[as.character(cur_yrfirm_gvkey),as.character(cur_yrfirm_gvkey)]
    se_int_mat <- se_int_mat[as.character(cur_yrfirm_gvkey),as.character(cur_yrfirm_gvkey)]
    
    # # Eigenvector centrality
    evc <- tryCatch(tibble(evc = evcent(as.matrix(int_mat),rescale = T,use.eigen = T)) %>% mutate(evc = if_else(evc < .0000001,0,evc)),
                    error = function(e){tibble(evc = evcent(as.matrix(int_mat),rescale = T,use.eigen = F)) %>% mutate(evc = if_else(evc < .0000001,0,evc))})
    evc <- evc$evc
                    
    # Interlocks
    dat[dat$year == yr, c("num_interlocks","num_indinterlocks","num_indinterlocks_alt","evc")] <- 0
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "num_interlocks"] <- apply(int_mat, 1, sum, na.rm = TRUE)
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "num_indinterlocks"] <- apply(ind_int_mat, 1, sum, na.rm = TRUE)
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "num_indinterlocks_alt"] <- apply(ind_int_mat_alt, 1, sum, na.rm = TRUE)
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "evc"] <- evc
    
    # cso_exists
    dat[dat$year == yr, c("cso_exists_ilwtd","cso_exists_indilwtd","cso_exists_indilwtd_alt","cso_exists_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "cso_exists"]; cur_var[is.na(cur_var)] <- 0
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cso_exists_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cso_exists_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cso_exists_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cso_exists_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    
    # cdp_report
    dat[dat$year == yr, c("cdp_report_ilwtd","cdp_report_indilwtd","cdp_report_indilwtd_alt","cdp_report_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "cdp_report"]; cur_var[is.na(cur_var)] <- 0  
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cdp_report_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cdp_report_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cdp_report_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "cdp_report_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    
    # numsupcoal
    dat[dat$year == yr, c("numsupcoal_ilwtd","numsupcoal_indilwtd","numsupcoal_indilwtd_alt","numsupcoal_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "numsupcoal"]; cur_var[is.na(cur_var)] <- 0  
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numsupcoal_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numsupcoal_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numsupcoal_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numsupcoal_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    
    # numoppcoal
    dat[dat$year == yr, c("numoppcoal_ilwtd","numoppcoal_indilwtd","numoppcoal_indilwtd_alt","numoppcoal_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "numoppcoal"]; cur_var[is.na(cur_var)] <- 0  
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numoppcoal_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numoppcoal_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numoppcoal_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "numoppcoal_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    
    # lob
    dat[dat$year == yr, c("lob_ilwtd","lob_indilwtd","lob_indilwtd_alt","lob_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "lob"]; cur_var[is.na(cur_var)] <- 0  
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lob_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lob_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lob_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lob_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    
    # lobsup
    dat[dat$year == yr, c("lobsup_ilwtd","lobsup_indilwtd","lobsup_indilwtd_alt","lobsup_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "lob"]*(dat[cur_yrfirm,"numsupcoal"] >0)*(dat[cur_yrfirm,"numoppcoal"] ==0); cur_var[is.na(cur_var)] <- 0  
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobsup_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobsup_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobsup_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobsup_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    
    # lobopp
    dat[dat$year == yr, c("lobopp_ilwtd","lobopp_indilwtd","lobopp_indilwtd_alt","lobopp_sewtd")] <- 0
    cur_var <- dat[cur_yrfirm, "lob"]*(dat[cur_yrfirm,"numoppcoal"] >0)*(dat[cur_yrfirm,"numsupcoal"] ==0); cur_var[is.na(cur_var)] <- 0  
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobopp_ilwtd"] <- as.matrix(crossprod(int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobopp_indilwtd"] <- as.matrix(crossprod(ind_int_mat, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobopp_indilwtd_alt"] <- as.matrix(crossprod(ind_int_mat_alt, Matrix(as.matrix(cur_var))))
    dat[dat$year == yr & (dat$gvkey %in% rownames(int_mat)), "lobopp_sewtd"] <- as.matrix(crossprod(se_int_mat, Matrix(as.matrix(cur_var))))
    print(yr)
    
    # Save file for the relevant year
    write_csv(dat %>% filter(year == yr),file = paste0(working_file_path, "analysis_data_",file_prefix,"_",yr,".csv"))
  }
  
  # Combine years
  dat <- read_csv(paste0(working_file_path,"analysis_data_",file_prefix,"_","1995.csv"))
                  
  files <- list.files(working_file_path)
  files <- files[grep(files,pattern = file_prefix)]
  files <- files[-grep(files,pattern = "1995|placeholder")]

  for(f in files){
    temp <- read_csv(paste0(working_file_path,f),show_col_types = F)
    dat <- bind_rows(dat,temp)
  }
  
  dat <- dat %>% 
    distinct() %>%
    arrange(gvkey, year)
  
  # Save analysis dataframe
  write_csv(dat, file = paste0(analysis_file_path,"analysis_data_",file_prefix,".csv"))
  
  stopCluster(cl)
}

# 9. Generate analysis dataframes -------------------------------------
# Dropping firms without revenue data
dat_revt_drop <- dat %>% filter(is.na(revt)==F)
interlock_measures_func(dat = dat_revt_drop,
                        interlocks = interlocks_df %>% filter(source %in% dat_revt_drop$gvkey & receiver %in% dat_revt_drop$gvkey),
                        file_prefix = "revt_drop")


# Using all firms
interlock_measures_func(dat = dat,
                        interlocks = interlocks_df,
                        file_prefix = "all_firms")


# Remove python environment
conda_remove("r-reticulate_boards_rep")


